{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 常用模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LinearRegression#线性回归\n",
    "lr=LinearRegression(normalize=True)\n",
    "\n",
    "#SVC，SVR\n",
    "from sklearn.svm import SVC,SVR\n",
    "svc=SVC(kernel='linear')\n",
    "\n",
    "#贝叶斯\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "gnb=GaussianNB()\n",
    "#KNN\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "knn=KNeighborsClassifier(n_neighbors=2)\n",
    "\n",
    "#无监督模型\n",
    "from sklearn.decomposition import PCA\n",
    "pca=PCA(n_components=0.95)\n",
    "from sklearn.cluster import KMeans\n",
    "k_means=KMeans(n_clusters=6,random_state=1)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "pca = PCA(n_components=2)\n",
    "pca.fit(X_train)\n",
    "X_train_reduction = pca.transform(X_train)\n",
    "X_test_reduction = pca.transform(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "# 传入每一步骤所对应的类 1.多项式的特征 2.数据归一化 3.线性回归\n",
    "poly_reg = Pipeline([\n",
    " (\"poly\", PolynomialFeatures(degree=2)),\n",
    " (\"std_scaler\", StandardScaler()),\n",
    " (\"lin_reg\", LinearRegression())\n",
    "])\n",
    "poly_reg.fit(X, y)\n",
    "y_predict = poly_reg.predict(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  模型训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lr.fit(x,y)\n",
    "\n",
    "k_means.fit(x,y)\n",
    "pca.fit_transform(x,y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 模型预测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred=lr.predict(x_test)\n",
    "y_pred=knn.predict_proba(x_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 模型评估"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "分类\n",
    "准确率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.metrics import accuracy_score\n",
    "y_pred = [0, 2, 1, 3]\n",
    "y_true = [0, 1, 2, 3]\n",
    "accuracy_score(y_true, y_pred)\n",
    "accuracy_score(y_true, y_pred, normalize=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "混淆矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import confusion_matrix\n",
    "y_true = [2, 0, 2, 2, 0, 1]\n",
    "y_pred = [0, 0, 2, 2, 0, 2]\n",
    "confusion_matrix(y_true, y_pred)\n",
    "array([[2, 0, 0],\n",
    "       [0, 0, 1],\n",
    "       [1, 0, 2]])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "分类报告\n",
    "classification_report函数构建了一个文本报告，用于展示主要的分类metrics。 \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    ">>> from sklearn.metrics import classification_report\n",
    ">>> y_true = [0, 1, 2, 2, 0]\n",
    ">>> y_pred = [0, 0, 2, 2, 0]\n",
    ">>> target_names = ['class 0', 'class 1', 'class 2']\n",
    ">>> print(classification_report(y_true, y_pred, target_names=target_names))\n",
    "             precision    recall  f1-score   support\n",
    " \n",
    "    class 0       0.67      1.00      0.80         2\n",
    "    class 1       0.00      0.00      0.00         1\n",
    "    class 2       1.00      1.00      1.00         2\n",
    " \n",
    "avg / total       0.67      0.80      0.72         5\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "回归"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import mean_squared_error#简称MSE，即均方误差\n",
    "from sklearn.metrics import median_absolute_error#abs(y_pred-y_test)\n",
    "from sklearn.metrics import mean_squared_log_error\n",
    "from sklearn.metrics import mean_absolute_error#(MAE)\n",
    "from sklearn.metrics import r2_score\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 聚类\n",
    "轮廓系数（Silhouette Coefficient）\n",
    "函数：\n",
    "\n",
    "def silhouette_score(X, labels, metric=‘euclidean’, sample_size=None,\n",
    "random_state=None, **kwds)："
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "函数值说明：\n",
    "所有样本的s i 的均值称为聚类结果的轮廓系数，定义为S，是该聚类是否合理、有效的度量。聚类结果的轮廓系数的取值在【-1,1】之间，值越大，说明同类样本相距约近，不同样本相距越远，则聚类效果越好。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    ">>> import numpy as np\n",
    ">>> from sklearn.cluster import KMeans\n",
    ">>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)\n",
    ">>> labels = kmeans_model.labels_\n",
    ">>> metrics.silhouette_score(X, labels, metric='euclidean')\n",
    "...                                                      \n",
    "0.55...\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "CH分数（Calinski Harabasz Score ）\n",
    "函数：\n",
    "def calinski_harabasz_score(X, labels):"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "函数值说明：\n",
    "类别内部数据的协方差越小越好，类别之间的协方差越大越好，这样的Calinski-Harabasz分数会高。 总结起来一句话：CH index的数值越大越好。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    ">>> import numpy as np\n",
    ">>> from sklearn.cluster import KMeans\n",
    ">>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)\n",
    ">>> labels = kmeans_model.labels_\n",
    ">>> metrics.calinski_harabaz_score(X, labels)  \n",
    "560.39...\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 交叉验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn import datasets\n",
    "data=datasets.load_iris()\n",
    "X=data.data\n",
    "y=data.target\n",
    "knn = KNeighborsClassifier(n_neighbors=5)\n",
    "score = cross_val_score(knn,X,y,cv=5,scoring='accuracy')\n",
    "print(score)\n",
    "print(score.mean())\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 网格搜索\n",
    "GridSearchCV，它存在的意义就是自动调参，只要把参数输进去，就能给出最优化的结果和参数。但是这个方法适合于小数据集，一旦数据的量级上去了，很难得出结果。这个时候就是需要动脑筋了。数据量比较大的时候可以使用一个快速调优的方法——坐标下降。它其实是一种贪心算法：拿当前对模型影响最大的参数调优，直到最优化；再拿下一个影响最大的参数调优，如此下去，直到所有的参数调整完毕。这个方法的缺点就是可能会调到局部最优而不是全局最优，但是省时间省力，巨大的优势面前，还是试一试吧，后续可以再拿bagging再优化。\n",
    "\n",
    "\n",
    "grid.fit()：运行网格搜索\n",
    "\n",
    "best_params_：描述了已取得最佳结果的参数的组合\n",
    "\n",
    "best_score_：成员提供优化过程期间观察到的最好的评分\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_iris\n",
    "from  sklearn.neighbors import KNeighborsClassifier\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "iris = load_iris()\n",
    "X = iris.data\n",
    "y = iris.target\n",
    "k_range = range(1,31)\n",
    "weights = ['uniform', 'distance']\n",
    "#\n",
    "param_gird = dict(n_neighbors=k_range,weights=weights)\n",
    " \n",
    "knn = KNeighborsClassifier(n_neighbors=5)#现在是一个参数进行网格搜索\n",
    " \n",
    "grid = GridSearchCV(knn,param_gird,cv=10,scoring='accuracy')\n",
    "grid.fit(X,y)\n",
    "print(grid.best_score_)\n",
    "print(grid.best_params_)\n",
    " \n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "param_grid = [\n",
    " { 'weights':['uniform'],  'n_neighbors':[i for i in range(1, 11)]},\n",
    "{ 'weights':['distance'], 'n_neighbors':[i for i in range(1, 11)],'p':[i for i in range(1, 6)]}\n",
    "]\n",
    "knn_clf = KNeighborsClassifier()\n",
    "grid_search = GridSearchCV(knn_clf, param_grid)\n",
    "grid_search.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 归一化和标准化"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "归一化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import MinMaxScaler\n",
    "mm = MinMaxScaler()\n",
    " \n",
    "data = mm.fit_transform(shuju )\n",
    "print(data)\n",
    "\n",
    "mm.transform(X_test)\n",
    "mm.inverse_transform()#还原\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "标准化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "standardScaler = StandardScaler()\n",
    "standardScaler.fit(X_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 标签编码\n",
    "\n",
    "第一步：先对离散的数字、离散的文本、离散的类别进行编号，使用 LabelEncoder，LabelEncoder会根据取值的种类进行标注。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sklearn.preprocessing as pre_processing\n",
    "import numpy as np\n",
    " \n",
    "label=pre_processing.LabelEncoder()\n",
    "labels=label.fit_transform(['中国','美国','法国','德国'])\n",
    "print(labels)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "第二步：然后进行独热编码,使用OneHotEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels=np.array(labels).reshape(len(labels),1) #先将X组织成（sample，feature）的格式\n",
    " \n",
    "onehot=pre_processing.OneHotEncoder()\n",
    "onehot_label=onehot.fit_transform(labels)\n",
    "print(onehot_label.toarray())   #这里一定要进行toarray()\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "结果为：\n",
    "\n",
    "[[1. 0. 0. 0.]\n",
    " \n",
    "[0. 0. 0. 1.]\n",
    " \n",
    "[0. 0. 1. 0.]\n",
    " \n",
    "[0. 1. 0. 0.]]\n",
    "\n",
    "注意，上面的第二步也可以使用LabelBinarizer进行替代\n",
    "\n",
    "onehot_label=pre_processing.LabelBinarizer().fit_transform(labels)\n",
    "\n",
    "这里的参数labels就是【0，3，2，1】,不需要组织成（samples，features）的形式。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 多项式特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "# include polynomials up to x ** 10:\n",
    "poly = PolynomialFeatures(degree=10)\n",
    "poly.fit(X)\n",
    "X_poly = poly.transform(X)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
